Using DHARMa package to evaluate the fit of our negative binomial model

Using Predict to Graph the lines

# predict.glm(total_model, type = "terms",)
plot_model(total_model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120,180]")) 

Plot Residuals

  • Graph is very busy with all 47K observations

  • see 2nd plot for 5K observations for a smaller group to examine (index 25000-30000)

  • Residuals look like an amorphous blob as suggested by Abner@CSCAR

simulationOutput <- simulateResiduals(fittedModel = total_model, plot = F)
residuals(simulationOutput) %>% str()
##  num [1:47353] 0.358 0.51 0.69 0.049 0.146 ...
#there is no alpha parameter in base R plot 
residuals(simulationOutput)  %>% 
  plot(main = "Residuals plotted by numerical index", pch = ".")

# residuals(simulationOutput)[25000:30000] %>% 
#   plot(main = "Residuals for observation indices 25000 - 3000")

Plots for Residuals - QQ and DHARMa

QQ Plot

  • KS Test = Two sample Kolmogorov-Smirnov (KS) Test

    • This function tests the overall uniformity of the simulated residuals in a DHARMa object - Deviation is significant between the expected residuals and the actual observed residuals.

    • “If the P value is small, conclude that the two groups were sampled from populations with different distributions.” -Prism help page

  • Dispersion Test

    • This function performs simulation-based tests for over/underdispersion

    • Over / underdispersion means that the observed data is more / less dispersed than expected under the fitted model.

    • Deviation is significant between the observed data and fitted model.

  • Outlier Test

    • This function tests if the number of observations outside the simulation envelope are larger or smaller than expected

    • Methods generate a null expectation, and then test for an excess or lack of outliers. Per default, testOutliers() looks for both, so if you get a significant p-value, you have to check if you have to many or too few outliers.

    • See Outlier test for distribution of outliers. - Many at 1.0 residual.

plotQQunif(simulationOutput)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

plotResiduals(simulationOutput)

DHARMa::testOutliers(simulationOutput, type = "bootstrap")

## 
##  DHARMa bootstrapped outlier test
## 
## data:  simulationOutput
## outliers at both margin(s) = 538, observations = 47353, p-value <
## 2.2e-16
## alternative hypothesis: two.sided
##  percent confidence interval:
##  0.006894495 0.008584989
## sample estimates:
## outlier frequency (expected: 0.00778683504740988 ) 
##                                         0.01136148

Looking at each model by journal

#smaller model with 2 terms
two_term_glmnb <-function(model_data, model_name) {

  total_model <-MASS::glm.nb(is.referenced.by.count~ da_factor + log(age.in.months) + 
       + log(age.in.months)*da_factor + log(age.in.months)*da_factor, data = model_data, link = log)

  return(total_model)
 
}

journals <- 
  nsd_yes_metadata %>%  
  count(journal_abrev) %>%  
  filter(journal_abrev != "jmbe") 

#j <- 5

for(j in 1:nrow(journals)) { 
  journal_data <- 
  nsd_yes_metadata %>% 
    filter(journal_abrev == journals[[j,1]]) %>%  
    mutate(da_factor = factor(da))


actual_citations <- 
 journal_data %>% 
    filter(age.in.months %in% c(12, 60, 84, 120, 180)) %>% 
    group_by(age.in.months, da_factor) %>%  
   count(is.referenced.by.count, .drop = FALSE) %>% 
   summarize(mean_citations = mean(is.referenced.by.count)) %>% 
  ggplot(aes(x = da_factor, y = mean_citations, color = factor(age.in.months))) + 
  geom_point() + 
  ggtitle(paste0("Observed mean citations\nPlotted for journal ", journals[[j,1]]))
print(actual_citations)

  model <- two_term_glmnb(journal_data, journals[[j,1]])
  
#  p <-  get_model_data(model = model, type = "pred", terms = c("da_factor", 
  # "age.in.months[12,60,84,120,180]"))
  


  
  if(j == 3) { #genomea = 5
  plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[84,120,144]")) +     ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
  
  p <-  get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[84,120,144]"))
  }
  else if (j == 9) { ##Mra - 7 
    plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84]")) +     ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
    p <-  get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84]"))
    
  }
  else if (j == 10 | j == 11 ) { #msphere, msystems 9 years
  plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,108]")) +     ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
  
  p <-  get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,108]"))
  }
  else if (j == 12 ) { 
      plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120]")) +     ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
      p <-  get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120]"))
  }
  else { 
     plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120,180]")) +     ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
     p <-  get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120,180]"))
  }
  
    ratio_plot <- tibble(da = p$x, predicted = p$predicted, age.in.months = as.numeric(as.character(p$group))) %>%  
    pivot_wider(names_from = da, values_from = predicted) %>%  
    mutate(ratio = `2`/`1`) %>%  
    ggplot(., aes(x = age.in.months, y = ratio)) + 
    geom_point() + 
     labs(title = paste0("Ratio of predicted citations for da = Yes to da = No \nPlotted for journal ", journals[[j,1]]), 
          y = "Ratio number of citations da=Yes/da=No")
    
 print(plot_model)
 print(ratio_plot)
 
  simulation <- simulateResiduals(fittedModel = model, plot = F)
  
  # residuals(simulation)  %>% 
  # plot(main = paste0("Residuals plotted for journal ", journals[[j,1]]), pch = ".")
  residuals <- 
   residuals(simulation) %>%  tibble(residual = .) %>%  
     ggplot(aes(y = residual, x = seq_along(residual) )) + 
     geom_point(size = 1) + 
     geom_smooth() +
     ggtitle(paste0("Residuals with geom_smooth line' \nPlotted for journal ", journals[[j,1]]))
  print(residuals)
  
  plot(simulation, sub = paste0("Simulation plotted for journal ", journals[[j,1]]))
  
  
  }
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L = G$L,
## : Fitting terminated with step failure - check results carefully

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details

JB 10 years

 j <-  5
  journal_data <- 
  nsd_yes_metadata %>% 
    filter(journal_abrev == journals[[j,1]]) %>%  
    mutate(da_factor = factor(da)) %>%  
    filter(age.in.months <= 120)
  


actual_citations <- 
 journal_data %>% 
    filter(age.in.months %in% c(12, 60, 84, 120)) %>% 
    group_by(age.in.months, da_factor) %>%  
   count(is.referenced.by.count, .drop = FALSE) %>% 
   summarize(mean_citations = mean(is.referenced.by.count)) %>% 
  ggplot(aes(x = da_factor, y = mean_citations, color = factor(age.in.months))) + 
  geom_point() + 
  ggtitle(paste0("Observed mean citations\nPlotted for journal ", journals[[j,1]]))
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
print(actual_citations)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

  model <- two_term_glmnb(journal_data, journals[[j,1]])
  # assign(journal_abrev[[j,1]], model)
  
  
     plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120]")) +     ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]], " for the last 10 years"))
  
  p <-  get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120,180]"))
  
  ratio_plot <- tibble(da = p$x, predicted = p$predicted, age.in.months = as.numeric(as.character(p$group))) %>%  
    pivot_wider(names_from = da, values_from = predicted) %>%  
    mutate(ratio = `2`/`1`) %>%  
    ggplot(., aes(x = age.in.months, y = ratio)) + 
    geom_point() + 
     labs(title = paste0("Ratio of predicted citations for da = Yes to da = No \nPlotted for journal ", journals[[j,1]]), 
          y = "Ratio number of citations da=Yes/da=No")

 print(plot_model)

 print(ratio_plot)

  simulation <- simulateResiduals(fittedModel = model, plot = F)
  
  # residuals(simulation)  %>% 
  # plot(main = paste0("Residuals plotted for journal ", journals[[j,1]]), pch = ".")
  
   residuals(simulation) %>%  tibble(residual = .) %>%  
     ggplot(aes(y = residual, x = seq_along(residual) )) + 
     geom_point(size = 1) + 
     geom_smooth() +
     ggtitle(paste0("Residuals with geom_smooth line' \nPlotted for journal ", journals[[j,1]], " for the last 10 years"))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

  plot(simulation, sub = paste0("Simulation plotted for journal ", journals[[j,1]]))